################################################################################################
################################################################################################
################################################################################################
#   Project: Nightlight. Johannes, Ryan, Eugenie
#   Objective: cleaning and merging data from census and nighlights
#   Python 2.7.11 |Anaconda 2.5.0 (64-bit)
#   [GCC 4.4.7 20120313 (Red Hat 4.4.7-1)] on linux2
################################################################################################
################################################################################################
################################################################################################

#######################################################################################################################
###     CONCORDANCE FILE
#######################################################################################################################
# we have to merge many different datasets.
# they all have different types of code. so let's start by getting the correct codes with the concordance file
import pandas as pd
import numpy as np
import os
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_rows', 55)
pd.set_option('display.max_columns', 250)
os.chdir('/home/emd/Dropbox/Night Lights Validation (Eugenie, Ryan, Johannes)/Data/Census_MERGED/')
concordance = pd.read_stata('concordance.dta')
concordance.columns = concordance.iloc[0,:]
concordance = concordance.drop(concordance.index[0])
concordance.dtypes
concordance.columns
concordance.describe()
concordance = concordance.replace({'': np.nan}).replace({' ': np.NaN}).replace({'NA': np.NaN}).replace({'0': np.nan})

# RENAMING VARIABLES
concordance = concordance.rename(columns={'STC 2001': 'St01', 'DTC 2001': 'Dis01', 'Sub-DT 2001': 'Subd01',
       'PLCN 2001': 'Vill01', 'NAME OF STATE, DISTRICTS, SUB-DISTTS. & VILLAGES 2001': 'Name01', 'MDDS STC': 'St11',
       'MDDS DTC': 'Dis11', 'MDDS Sub_DT': 'Subd11', 'MDDS PLCN': 'Vill11',
       'MDDS NAME OF STATE, DISTRICT, SUB-DISTTS. & VILLAGES': 'Name11'})

# CHECKING WHETHER THERE IS MISSING DATA FOR THE 2011 VARIABLES
concordance[pd.isnull(concordance['Vill11'])] # observations where Vill11 is NaN: zero # Vill11 is never missing or nan
concordance[pd.isnull(concordance['St11'])]  # no missing values
concordance[pd.isnull(concordance['Dis11'])]  # no missing values
concordance[pd.isnull(concordance['Subd11'])]   # no missing values

# CLEANING ALL THE CODES
list = ['St01', 'Dis01', 'Subd01', 'Vill01', 'St11', 'Dis11', 'Subd11', 'Vill11']
dict_correctlength = {}
dict_correctlength['St01'] = 2
dict_correctlength['Dis01'] = 2
dict_correctlength['Subd01'] = 4
dict_correctlength['Vill01'] = 8
dict_correctlength['St11'] = 2
dict_correctlength['Dis11'] = 3
dict_correctlength['Subd11'] = 5
dict_correctlength['Vill11'] = 6

for var in list:
    length = dict_correctlength[var]
    concordance[var] = concordance[var].astype(np.str)
    concordance[var] = length * '0' + concordance[var]
    concordance[var] = concordance[var].str[-length:]
    print 'Variable:', var, 'Length=', length
    print concordance[var].str.len().value_counts()

concordance['StateVillCode01'] = concordance['St01'] + concordance['Vill01']
concordance['StateVillCode01'].describe()  # 636073 / 652511
concordance['StateVillCode11'] = concordance['St11'] + concordance['Vill11']
concordance['StateVillCode11'].describe()
concordance['FullCode01'] = concordance['St01'] + concordance['Dis01'] + concordance['Subd01'] + concordance['Vill01']
concordance['FullCode01'].describe()
concordance['FullCode11'] = concordance['St11'] + concordance['Dis11'] + concordance['Subd11'] + concordance['Vill11']
concordance['FullCode11'].describe()

# FINDING CODES THAT HAVE DUPLICATE OBSERVATIONS
codes = concordance['FullCode11'].value_counts()[concordance['FullCode11'].value_counts()>1].index.values
concordance[concordance['FullCode11'].isin(codes)].sort_values(by='FullCode11') # 479 rows
# drop the observations that have exactly the same FullCode11
concordance = concordance[~concordance.duplicated(['FullCode11'])]

concordance['FullCode11'].describe() # 652250 / 652250
concordance['Vill11'].describe() # 645594 / 652250
# investigating for which observation Vill11 does not uniquely identify
concordance[concordance.duplicated(['Vill11'])].sort_values(by='FullCode11') # 6656 rows
# Vill11 with '000000' are urbanized centers. Vill11 will not be unique for these
concordance = concordance[concordance['Vill11'] != '000000']
concordance['Vill11'].describe() # 645593 / 645615. ok good enough # 44 observations in total
# dropping the duplicates
concordance['Vill11'].value_counts()
concordance[concordance.duplicated(['Vill11'])].sort_values(by='Vill11')
codes_dupl = concordance['Vill11'].value_counts()[concordance['Vill11'].value_counts()>1].index.values
concordance = concordance[~concordance['Vill11'].isin(codes_dupl)]
concordance['Vill11'].describe() # 645571 / 645571
concordance['StateVillCode01'].describe()  # 635929 / 645571


#######################################################################################################################
###     NIGHTLIGHT DATA
#######################################################################################################################
# there are three files containing NL data:
# - File 1: Shape file data + one pixel(1km) + bilinear interpolation (4 pixels)
# - File 2: data for 2km, 3km and 5km perimeter
# - File 3: data from Brian Min. IndiaIO API
import pandas as pd
import numpy as np
import os
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_rows', 95)
pd.set_option('display.max_columns', 250)
os.chdir('/home/emd/Dropbox/Night Lights Validation (Eugenie, Ryan, Johannes)/Data/Python_code_merging/')

# File 1
NL_shape_file = '/home/emd/Dropbox/Night Lights Validation (Eugenie, Ryan, Johannes)/Data/Villages 2011/Villages2011Lights/v2011wpts.csv'
NL_shape_data = pd.read_csv(NL_shape_file, header=0, na_values=['NA'], low_memory=False)
NL_shape_data = NL_shape_data.replace({'': np.NaN}).replace({' ': np.NaN}).replace({'NA': np.NaN})
NL_shape_data = NL_shape_data.drop_duplicates()
# About ['CENSUS2011'] = state code (2 digits) + district (3 digits) + subdistrict (5 digits) + village (6 digits)
# the village code should be a unique identifier
# cleaning CENSUS2011
NL_shape_data.dtypes
NL_shape_data['CENSUS2011'] = NL_shape_data['CENSUS2011'].astype(np.str)
NL_shape_data['CENSUS2011'].value_counts()
NL_shape_data['CENSUS2011'].str.len().value_counts()
# 176 observations don t have 16 digits. some less some more. i drop those
mask = NL_shape_data['CENSUS2011'].str.len() == 16
NL_shape_data = NL_shape_data.loc[mask]
NL_shape_data['CENSUS2011'].describe() # 630829/630901


# File 2
NL_perimeter_file = '/home/emd/Dropbox/Night Lights Validation (Eugenie, Ryan, Johannes)/Data/Villages 2011/vZonesall.csv'
NL_perimeter_data = pd.read_csv(NL_perimeter_file, header=0, na_values=['NA'], low_memory=False)
NL_perimeter_data = NL_perimeter_data.replace({'': np.NaN}).replace({' ': np.NaN}).replace({'NA': np.NaN})
NL_perimeter_data = NL_perimeter_data.drop_duplicates()
# cleaning CENSUS2011
NL_perimeter_data.dtypes
NL_perimeter_data['CENSUS2011'] = NL_perimeter_data['CENSUS2011'].astype(np.str)
NL_perimeter_data['CENSUS2011'].value_counts()
NL_perimeter_data['CENSUS2011'].str.len().value_counts()
# 172 observations don t have 16 digits. some less some more. i drop those
mask = NL_perimeter_data['CENSUS2011'].str.len() == 16
NL_perimeter_data = NL_perimeter_data.loc[mask]
NL_perimeter_data['CENSUS2011'].describe() # 515823/515823



# merging the two NL files together
print NL_perimeter_data['CENSUS2011'].describe() # 515823/515823
print NL_shape_data['CENSUS2011'].describe() # 630829/630901
print NL_perimeter_data['CENSUS2011'].str.len().value_counts(), '\n', NL_shape_data['CENSUS2011'].str.len().value_counts()
print NL_perimeter_data['CENSUS2011'].dtype, '\n', NL_shape_data['CENSUS2011'].dtype

NL_data_shper = pd.merge(NL_shape_data, NL_perimeter_data, how='outer', left_on='CENSUS2011', right_on='CENSUS2011',
                         suffixes=('_sh', '_per'), indicator=True)
NL_data_shper['_merge'].value_counts()
# both           515895
# left_only      115006
# right_only          0

# keeping only relevant columns from the NL file
# are areas changing across years? no. keep only 'AREA2011'
test = (NL_data_shper['ZONE_CODE2011'] == NL_data_shper['ZONE_CODE2010']) | NL_data_shper['ZONE_CODE2010'].isnull()
test.value_counts()

namechanges = {}
for name in NL_data_shper.columns.values:
    if name not in ['Vill11', 'CENSUS2011']:
        namechanges[name] = name.lower().replace('20', '').replace('09', '_09').replace('10', '_10').replace('11', '_11')
        if name in NL_data_shper.columns.values[2:41]:
            namechanges[name] = 'SH_' + namechanges[name]
        if ('2k' in name) or ('3k' in name) or ('5k' in name):
            namechanges[name] = namechanges[name][-2] + 'K_' + namechanges[name][:-3]
        if ('ptval' in name) or ('biptval' in name):
            namechanges[name] = namechanges[name][:2].upper() + namechanges[name][2:]
        print name, namechanges[name]
NL_data_shper = NL_data_shper.rename(columns=namechanges)
NL_data_shper = NL_data_shper.rename(columns={'SH_area_11': 'SH_area'})
NL_data_shper = NL_data_shper.rename(columns={'2K_area_11': '2K_area'})
NL_data_shper = NL_data_shper.rename(columns={'3K_area_11': '3K_area'})
NL_data_shper = NL_data_shper.rename(columns={'5K_area_11': '5K_area'})

NL_data_shper = NL_data_shper.drop(['SH_area_09', 'SH_area_10', '2K_area_09', '2K_area_10', '3K_area_09', '3K_area_10',
                        '5K_area_09', '5K_area_10'], axis=1)
list_words = ['min', 'max', 'mean', 'std', 'sum', 'median', 'count', 'variety', 'majority', 'minority',
              'SH_area', '2K_area', '3K_area', '5K_area', 'PTval', 'BIptval', 'CENSUS2011']
col_to_keep = []
for item in NL_data_shper.columns.values:
    if any(words in item for words in list_words):
        col_to_keep.append(item)
NL_data_shper = NL_data_shper[col_to_keep]

# construct 2011 village code. we'll merge on it later
NL_data_shper['Vill11'] = NL_data_shper['CENSUS2011'].str[-6:]
NL_data_shper['Vill11'].str.len().value_counts()
NL_data_shper['Vill11'].value_counts()
NL_data_shper['Vill11'].describe() # 630643 / 630901
nonunique_codes = NL_data_shper['Vill11'].value_counts()[NL_data_shper['Vill11'].value_counts() > 1].index.values
len(nonunique_codes) # 153
nonunique_codes.sort()
mask = NL_data_shper['Vill11'].isin(nonunique_codes)
mask.value_counts()
NL_data_shper.loc[mask].sort_values(by='Vill11')
NL_data_shper_dropped = NL_data_shper.loc[mask]
NL_data_shper = NL_data_shper.loc[~mask]
NL_data_shper['Vill11'].describe() # 630490 / 630490

# let's save the list of these redundant codes in a pkl file
# and let's save the observations corresponding to these codes in csv files
# import cPickle
# filepath = '/home/emd/Dropbox/Night Lights Validation (Eugenie, Ryan, Johannes)/Data/Python_code_merging/list_codeswduplicates_inNLshape.pkl'
# with open(filepath, 'wb') as file:
#     cPickle.dump(nonunique_codes, file)
# filepath = '/home/emd/Dropbox/Night Lights Validation (Eugenie, Ryan, Johannes)/Data/Python_code_merging/NL_shape_data_dropped.csv'
# NL_shape_data_dropped.to_csv(filepath, sep=',', header=True, index=False, na_rep='')

col_to_keep = NL_data_shper.columns.values.tolist()
col_to_keep.insert(0, col_to_keep[-1])
col_to_keep = col_to_keep[:-1]
NL_data_shper = NL_data_shper[col_to_keep]

# SUMMARY
print NL_data_shper['Vill11'].describe()
# 630490 observations uniquely identified by Vill11
# We started with 631101 and had to drop some because either code did not have the correct length or were duplicates


# File 3:
NL_api_file = '/home/emd/Dropbox/Night Lights Validation (Eugenie, Ryan, Johannes)/Data/IndiaIO/ltsio(census).csv'
NL_api_data = pd.read_csv(NL_api_file, header=0, na_values=['NA'])
NL_api_data['villagecode'] = NL_api_data['villagecode'].astype(np.int)
NL_api_data['villagecode'] = NL_api_data['villagecode'].astype(np.str)
print NL_api_data['villagecode'].str.len().value_counts()
NL_api_data['villagecode'] = '0' + NL_api_data['villagecode']
NL_api_data['villagecode'] = NL_api_data['villagecode'].str[-16:]
print NL_api_data['villagecode'].describe() # 591935/5622893

# grouping / collapsing
grouped = NL_api_data.groupby(['villagecode', 'year'], as_index=False)
NL_api_data_gr = grouped.agg({'vis_max': 'max', 'vis_min': 'min', 'vis_mean': 'mean',
                                    'vis_median': 'mean', 'vis_sd': 'mean', 'count': 'sum'})
NL_api_data_gr = NL_api_data_gr.rename(columns={'vis_max': 'max', 'vis_min': 'min', 'vis_mean': 'mean',
                                    'vis_median': 'mean_ofmonthly_median', 'vis_sd': 'mean_ofmonthly_sd', 'count': 'count'})
NL_api_data_gr = NL_api_data_gr[['villagecode', 'year', 'mean', 'min', 'max', 'mean_ofmonthly_median', 'mean_ofmonthly_sd', 'count']]

NL_api_data_all, colchanges = {}, {}
for year in [2009, 2010, 2011]:
    NL_api_data_gr_yr = NL_api_data_gr[NL_api_data_gr['year'] == year]
    del NL_api_data_gr_yr['year']
    for name in NL_api_data_gr.columns[2:]:
        colchanges[name] = 'API_' + name + '_' + str(year)
    NL_api_data_gr_yr = NL_api_data_gr_yr.rename(columns=colchanges)
    NL_api_data_all[year] = NL_api_data_gr_yr

# merging
NL_api_data = pd.merge(NL_api_data_all[2009], NL_api_data_all[2010], how='outer', on='villagecode', indicator=True)
print NL_api_data['_merge'].value_counts()
NL_api_data = NL_api_data.rename(columns={'_merge': 'merge_09_10'})
NL_api_data = pd.merge(NL_api_data, NL_api_data_all[2011], how='outer', on='villagecode', indicator=True)
print NL_api_data['_merge'].value_counts()
NL_api_data = NL_api_data.rename(columns={'_merge': 'merge_0910_11'})
del NL_api_data['merge_09_10']
del NL_api_data['merge_0910_11']

# THE API NL DATA IS IDENTIFIED BY 2001 CODES. THE SHAPE FILE DATA IS IDENTIFIED BY 2011 CODES.
# first we need to merge the api data with concordance to get 2011 codes
NL_api_data = pd.merge(NL_api_data, concordance, how='outer', left_on='villagecode', right_on='FullCode01', indicator=True)
print NL_api_data['_merge'].value_counts()
# both          599355
# right_only     46216
# left_only        107
NL_api_data = NL_api_data[NL_api_data['_merge'] == 'both']
del NL_api_data['_merge']
collist = ['FullCode01', 'FullCode11', 'Vill11']
collist.extend(NL_api_data.columns.values[1:-14].tolist())
NL_api_data = NL_api_data[collist]

# merging file 3 with the other NL files
print NL_api_data['Vill11'].describe() # 599355/599355
print NL_data_shper['Vill11'].describe() # 630490/630490
# the 2 codes must be 6 digits long and strings before merging!
print NL_api_data['Vill11'].str.len().value_counts(), '\n', NL_data_shper['Vill11'].str.len().value_counts()
print NL_api_data['Vill11'].dtype, '\n', NL_data_shper['Vill11'].dtype

NL_data = pd.merge(NL_data_shper, NL_api_data, how='outer', on='Vill11', indicator=True)
NL_data['_merge'].value_counts()
# both          582462
# left_only      48028
# right_only     16893
NL_data = NL_data.rename(columns={'_merge': 'merge_sh_api'})
NL_data.columns.values

# summary NL data source. shape / perimeter / api
NL_data['NL_source Sh'] = ''
NL_data['NL_source Pt'] = ''
NL_data['NL_source Bpt'] = ''
NL_data['NL_source Per2k'] = ''
NL_data['NL_source Per3k'] = ''
NL_data['NL_source Per5k'] = ''
NL_data['NL_source Api'] = ''
NL_data.loc[:, 'NL_source Sh'][NL_data.loc[:, 'SH_sum_11'].notnull()] = 'Sh'
NL_data.loc[:, 'NL_source Pt'][NL_data.loc[:, 'PTval_11'].notnull()] = 'Pt'
NL_data.loc[:, 'NL_source Bpt'][NL_data.loc[:, 'BIptval_11'].notnull()] = 'Bpt'
NL_data.loc[:, 'NL_source Per2k'][NL_data.loc[:, '_2K_sum_11'].notnull()] = 'Per2k'
NL_data.loc[:, 'NL_source Per3k'][NL_data.loc[:, '_3K_sum_11'].notnull()] = 'Per3k'
NL_data.loc[:, 'NL_source Per5k'][NL_data.loc[:, '_5K_sum_11'].notnull()] = 'Per5k'
# NL_data.loc[:, 'NL_source Per2k'][NL_data.loc[:, '2K_sum_11'].notnull()] = 'Per2k'
# NL_data.loc[:, 'NL_source Per3k'][NL_data.loc[:, '3K_sum_11'].notnull()] = 'Per3k'
# NL_data.loc[:, 'NL_source Per5k'][NL_data.loc[:, '5K_sum_11'].notnull()] = 'Per5k'
NL_data.loc[:, 'NL_source Api'][NL_data.loc[:, 'API_mean_2011'].notnull()] = 'Api'



NL_data['NL_source'] = NL_data['NL_source Sh'] + NL_data['NL_source Pt'] + NL_data['NL_source Bpt'] +\
                       NL_data['NL_source Per2k'] + NL_data['NL_source Per3k'] + NL_data['NL_source Per5k'] + NL_data['NL_source Api']
NL_data['NL_source'].value_counts()
# ShPtBptPer2kPer3kPer5kApi    362330
# ShPtBptApi                    69628
# ShPtBptPer2kPer3kApi          42035
# ShPtBptPer2kApi               29518
# PtBptApi                      29507
# ShPtBptPer2kPer3kPer5k        19558
# Api                           16893
# PtBptPer2kPer3kPer5kApi       16867
# PtBpt                          8118
# ShPtBpt                        7731
# ShPtBptPer2kPer5kApi           5700
# ShPtBptPer3kApi                5510
# PtBptPer2kPer3kApi             5321
# PtBptPer2kApi                  5060
# ShPtBptPer5kApi                3830
# ShPtBptPer2kPer3k              3123
# ShPtBptPer3kPer5kApi           2957
# PtBptPer2kPer3kPer5k           2604
# ShPtBptPer2k                   2526
#                                2115
# PtBptPer3kApi                  1423
# PtBptPer2k                     1081
# PtBptPer2kPer3k                1040
# PtBptPer5kApi                  1028
# PtBptPer2kPer5kApi             1018
# PtBptPer3kPer5kApi              730
# ShPtBptPer3k                    450
# ShPtBptPer2kPer5k               407
# ShPtBptPer5k                    359
# PtBptPer3k                      270
# ShPtBptPer3kPer5k               241
# PtBptPer5k                      208
# PtBptPer2kPer5k                 208
# PtBptPer3kPer5k                 104

NL_data = NL_data.drop(['merge_sh_api', 'FullCode11', 'CENSUS2011'], axis=1)
NL_data = NL_data.drop(['NL_source Sh', 'NL_source Per2k', 'NL_source Per3k', 'NL_source Per5k', 'NL_source Api', 'NL_source Pt', 'NL_source Bpt'], axis=1)
NL_data.columns.values


###     MERGING CONCORDANCE AND NIGHTLIGHT DATA
print concordance['Vill11'].describe() # 645571/645571
print NL_data['Vill11'].describe() # 647383/647383
# the 2 codes must be 6 digits long and strings before merging!
print concordance['Vill11'].str.len().value_counts(), '\n', NL_data['Vill11'].str.len().value_counts()
print concordance['Vill11'].dtype, '\n', NL_data['Vill11'].dtype

mydata = pd.merge(concordance, NL_data, how='outer', left_on='Vill11', right_on='Vill11',
                         suffixes=('_conc', '_nl'), indicator=True)
mydata['_merge'].value_counts()
# both          643468
# left_only       3915
# right_only      2103

mydata = mydata.rename(columns={'_merge': 'merge_NL_conc'})
mydata['merge_NL_conc'].cat.categories
mydata['merge_NL_conc'] = mydata['merge_NL_conc'].cat.rename_categories(['conc_only', 'NL_only', 'both_conc_NL'])

# saving the NL observation not matched in conc
filepath = '/home/emd/Dropbox/Night Lights Validation (Eugenie, Ryan, Johannes)/Data/Python_code_merging/Obs_inNL_notinConcord.csv'
mydata[mydata['merge_NL_conc'] == 'NL_only'].to_csv(filepath, sep=',', header=True, index=False, na_rep='')

#######################################################################################################################
###     CENSUS DATA
#######################################################################################################################
import pandas as pd
import os
import rpy2.robjects as robjects
import pandas.rpy.common as com
import numpy as np
# Setting the width of printing large enough
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_rows', 250)
pd.set_option('display.max_columns', 250)
os.chdir('/home/emd/Dropbox/Night Lights Validation (Eugenie, Ryan, Johannes)/Data/Python_code_merging/')
os.chdir('/home/emd/Dropbox/Night Lights Validation (Eugenie, Ryan, Johannes)/Data/Census_MERGED/')
print robjects.r.load("census.RData")
census_full = com.load_data('final')
os.chdir('/home/emd/Dropbox/Night Lights Validation (Eugenie, Ryan, Johannes)/Data/Python_code_merging/')
print 'a'
name_variables = ['c01_2001_st_code', 'c01_2001_dist_code', 'c01_2001_subdist_code', 'c01_2001_block_code', 'c01_2001_vill_code',
                  'c01_2001_vill_name', 'c11_2011_vill_code',

                  'c01_2001_area', 'c01_2001_t_hh', 'c01_2001_t_p', "c01_2001_p_lit",
                  'c01_2001_sc_p', 'c01_2001_st_p', 'c01_2001_edu_fac', 'c01_2001_medi_fac', 'c01_2001_drnk_wat_f',
                  'c01_2001_tap', 'c01_2001_p_t_fac', 'c01_2001_bs_fac', 'c01_2001_rs_fac', 'c01_2001_nw_fac',
                  'c01_2001_bank_fac', 'c01_2001_rc_fac', 'c01_2001_c_v_hall', 'c01_2001_app_pr', 'c01_2001_dist_town',
                  'c01_2001_power_supl', 'c01_2001_power_dom', 'c01_2001_power_agr', 'c01_2001_power_oth', 'c01_2001_power_all',
                  'c01_2001_tot_inc', 'c01_2001_tot_exp', 'c01_2001_tot_work_p', 'c01_2001_mainwork_p', 'c01_2001_main_cl_p',
                  'c01_2001_main_al_p', 'c01_2001_main_hh_p',

                  'c11_2011_comh_cntr', 'c11_2011_ph_cntr', 'c11_2011_pr_school_dummy', 'c11_2011_pr_school_number',
                  'c11_2011_elec_light', 'c11_2011_kero_light', 'c11_2011_solar_light', 'c11_2011_other_light', 'c11_2011_another_light',
                  'c11_2011_nolight_light', 'c11_2011_cooking_electricity', 'c11_2011_power_domestic', 'c11_2011_power_agri',
                  'c11_2011_power_commercial', 'c11_2011_power_all', 'c11_2011_total_area',
                  'c11_2011_total_hhs', 'c11_2011_total_pop', 'c11_2011_total_sc_pop', 'c11_2011_total_st_pop', 'c11_2011_p_lit',
                  'c11_2011_asset_availing_bank', 'c11_2011_asset_radio_transitor', 'c11_2011_asset_tv',
                  'c11_2011_asset_landline_phone', 'c11_2011_asset_mobile_phone', 'c11_2011_asset_both_phone',
                  'c11_2011_asset_bike', 'c11_2011_asset_scooter_motor_moped', 'c11_2011_asset_car_jeep_van', 'c11_2011_asset_none']

census = census_full[name_variables]
census = census.replace({'': np.NaN}).replace({' ': np.NaN}).replace({'NA': np.NaN})
# Correcting a mistake in the village code
census['c01_2001_vill_code'] = census['c01_2001_vill_code'].replace('9e+05', '900000')
# converting some of the variables into floats, and the code variables in string
listout = ['c01_2001_st_code', 'c01_2001_dist_code', 'c01_2001_subdist_code', 'c01_2001_block_code', 'c01_2001_vill_code',
                  'c01_2001_vill_name', 'c11_2011_vill_code']
census.dtypes
for colname in census.columns:
    print colname, census[colname].dtype
    if (census[colname].dtype == np.float64 or census[colname].dtype == np.int64):
        pass
    elif colname in listout:
        census[colname] = census[colname].astype(np.str)
    else:
        census[colname] = census[colname].str.replace(',', '')
        census[colname] = pd.to_numeric(census[colname], errors='coerce')
        print colname, census[colname].dtype

# Cleaning the 2001 variables that are dummies. change anything different from 1 and 2 to NaN. then change 2 to 0
list_2001_dummies = ['c01_2001_edu_fac', 'c01_2001_medi_fac', 'c01_2001_drnk_wat_f', 'c01_2001_tap',
                      'c01_2001_p_t_fac', 'c01_2001_bs_fac', 'c01_2001_rs_fac', 'c01_2001_nw_fac',
                   'c01_2001_bank_fac', 'c01_2001_rc_fac',  'c01_2001_app_pr', 'c01_2001_power_supl',
                      'c01_2001_power_dom', 'c01_2001_power_agr', 'c01_2001_power_oth', 'c01_2001_power_all']
for name in list_2001_dummies:
    census[name] = census[name].replace(0, np.NaN)
    census[name] = census[name].replace(3, np.NaN)
    census[name] = census[name].replace(2, 0)

# cleaning the 2011 var that are dummies
list_2011_dummies = ['c11_2011_pr_school_dummy', 'c11_2011_power_domestic', 'c11_2011_power_agri', 'c11_2011_power_commercial',
                     'c11_2011_power_all']
for name in list_2011_dummies:
    census[name] = census[name].replace(0, np.NaN)
    census[name] = census[name].replace(3, np.NaN)
    census[name] = census[name].replace(4, np.NaN)
    census[name] = census[name].replace(16, np.NaN)
    census[name] = census[name].replace(2, 0)



# IMPORT FROM THERE
filepath = '/home/emd/Dropbox/Night Lights Validation (Eugenie, Ryan, Johannes)/Data/Python_code_merging/census_subset.csv'
#census.to_csv(filepath, sep=',', header=True, index=False, na_rep='')
census = pd.read_csv(filepath, header=0, na_values=['NA'], low_memory=False)
census = census.replace({'NA': np.NaN})
census.dtypes
listout = ['c01_2001_st_code', 'c01_2001_dist_code', 'c01_2001_subdist_code', 'c01_2001_block_code', 'c01_2001_vill_code',
                  'c01_2001_vill_name', 'c11_2011_vill_code']
for colname in listout:
    census[colname] = census[colname].astype(np.str)
census['c11_2011_vill_code'] = '000000' + census['c11_2011_vill_code']
census['c11_2011_vill_code'] = census['c11_2011_vill_code'].str[-6:]
# summary:
print census['c11_2011_vill_code'].describe() # 596855 / 596855 uniquely identified by c11_2011_vill_code


### MERGING MAIN DATA WITH CENSUS
# attach the concordance file to the data
print mydata['Vill11'].describe() # 649486 / 649486
print census['c11_2011_vill_code'].str.len().value_counts(), '\n', mydata['Vill11'].str.len().value_counts()
print census['c11_2011_vill_code'].dtype, '\n', mydata['Vill11'].dtype
# c11_2011_vill_code and Vill11 are both unique

mydata = pd.merge(mydata, census, how='outer', left_on='Vill11', right_on='c11_2011_vill_code',
                         suffixes=('_cen', '_con'), indicator=True)
mydata['_merge'].value_counts()
# both          596843
# left_only      52643
# right_only        12

mydata = mydata.rename(columns={'_merge': 'merge_census'})
mydata['merge_census'].cat.categories
mydata['merge_census'] = mydata['merge_census'].cat.rename_categories(['concNL_only', 'census_only', 'both_concNL_cen'])

##############################################################################################################
#####           MERGE TRACKING VARIABLE
##############################################################################################################
# Creating a variable that will indicate in which datasets that observation can be found ie either in all three (census_NL_RGGVY)
# in only census and NL (census_NL), ...

mydata['merge_NL_conc'].value_counts()
mydata['merge_census'].value_counts()

def source_fct(row):
    if row['merge_NL_conc'] == 'both_conc_NL' and row['merge_census'] == 'both_concNL_cen':
        return 'conc_NL_census'
    elif row['merge_NL_conc'] == 'both_conc_NL' and row['merge_census'] == 'concNL_only':
        return 'conc_NL'
    elif row['merge_NL_conc'] == 'conc_only' and row['merge_census'] == 'both_concNL_cen':
        return 'conc_census'
    elif row['merge_NL_conc'] == 'NL_only' and row['merge_census'] == 'both_concNL_cen':
        return 'NL_census'
    elif row['merge_NL_conc'] == 'conc_only' and row['merge_census'] == 'concNL_only':
        return 'conc'
    elif row['merge_NL_conc'] == 'NL_only' and row['merge_census'] == 'concNL_only':
        return 'NL'
    elif row['merge_census'] == 'census_only':
        return 'census'
    else:
        return None
mydata['Source'] = mydata.apply(source_fct, axis=1)
mydata['Source'].value_counts()
# conc_NL_census    596578
# conc_NL            46890
# NL                  3912
# conc                1841
# conc_census          262
# census                12
# NL_census              3
#

##############################################################################################################3
##        Village COORDINATES - (Lon Lat) - will need to merge on state-village code 2001
##############################################################################################################3
coordinates_file = '/home/emd/Dropbox/Night Lights Validation (Eugenie, Ryan, Johannes)/Data/Village_COORDINATES/coordinates.csv'
coordinates = pd.read_csv(coordinates_file, header=0, na_values=['NA'], low_memory=False)
coordinates.columns
coordinates = coordinates.drop_duplicates()

coordinates['C_CODE01'] = coordinates['C_CODE01'].astype(np.str)
coordinates['C_CODE01'].str.len().value_counts()
coordinates = coordinates[coordinates['C_CODE01'].str.len() == 16]
coordinates['C_CODE01'].describe() # 395190 / 395300. C_CODE01 not unique
coordinates = coordinates[~coordinates.duplicated(['C_CODE01'])]
coordinates['C_CODE01'].describe() # 395190 / 395190. C_CODE01 unique

# merging with concordance
# this will be a one-to-many merge.
# one obs in coordinates can match several in concordance
# this is because some 2001 villages were split in several 2011 villages
# merge on statevillage2001 code cause probably less mistakes the in the full code
coordinates['StateVill01'] = coordinates['C_CODE01'].str[:2] + coordinates['C_CODE01'].str[-8:]
coordinates['StateVill01'].str.len().value_counts()
coordinates['StateVill01'].describe() # 395177 / 395190
coordinates['StateVill01'].value_counts()

coordinates = coordinates[~coordinates['StateVill01'].isin(coordinates[coordinates.duplicated(['StateVill01'])]['StateVill01'])]
coordinates['StateVill01'].describe() # 395164 / 395164
concordance['StateVillCode01'].describe() # 635929 / 645571
print coordinates['StateVill01'].str.len().value_counts(), '\n', concordance['StateVillCode01'].str.len().value_counts()
print coordinates['StateVill01'].dtype, '\n', concordance['StateVillCode01'].dtype
# merge one-to-many!
coordinates = pd.merge(coordinates, concordance, how='outer', left_on='StateVill01', right_on='StateVillCode01',
                         suffixes=('_coor', '_concor'), indicator=True)
coordinates['_merge'].value_counts()
# both          396282
# right_only    249289
# left_only       2307

coordinates = coordinates[coordinates['_merge'] == 'both']
coordinates = coordinates[['latitude', 'longitude', 'StateVill01', 'Vill11']]
coordinates['StateVill01'].describe()

coordinates[coordinates['StateVill01'].isin(coordinates[coordinates.duplicated(['StateVill01'])]['StateVill01'])].sort_values(by='StateVill01')

# merging coordinates with main data
mydata['Vill11'].describe() # 649486 / 649486
coordinates['Vill11'].describe() # 396282 / 396282
print mydata['Vill11'].str.len().value_counts(), '\n', coordinates['Vill11'].str.len().value_counts()
print mydata['Vill11'].dtype, '\n', coordinates['Vill11'].dtype

mydata = pd.merge(mydata, coordinates, how='outer', left_on='Vill11', right_on='Vill11', indicator=True)
mydata['_merge'].value_counts()
# both          396282
# left_only     253216 only in main data
# right_only         0
mydata = mydata.drop(['_merge'], axis=1)
mydata = mydata.rename(columns={'StateVill01': 'StateVill01_coord'})


##############################################################################################################3
##          CLEANING RGGVY
##############################################################################################################3
import pandas as pd
import os
import numpy as np
# Setting the width of printing large enough
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_rows', 25)
pd.set_option('display.max_columns', 250)
rggvy_file = '/home/emd/Dropbox/Night Lights Validation (Eugenie, Ryan, Johannes)/Data/RGGVY/rggvy_CLEAN.dta'
RGGVY = pd.read_stata(rggvy_file)
RGGVY = RGGVY.replace({'': np.NaN}).replace({'NA': np.NaN}).replace({' ': np.NaN})
RGGVY.columns
RGGVY.dtypes

# Constructing a 2001 state-village code
RGGVY['RGGVY_vill_cd'] = RGGVY['RGGVY_vill_cd'].astype(np.str)
RGGVY['RGGVY_vill_cd'] = '00000000' + RGGVY['RGGVY_vill_cd']
RGGVY['RGGVY_vill_cd'] = RGGVY['RGGVY_vill_cd'].str[-8:]
RGGVY['state_cd'] = RGGVY['state_cd'].astype(np.str)
RGGVY['state_cd'] = '00' + RGGVY['state_cd']
RGGVY['state_cd'] = RGGVY['state_cd'].str[-2:]
RGGVY['StateVillCode01'] = RGGVY['state_cd'] + RGGVY['RGGVY_vill_cd']
RGGVY['StateVillCode01'].str.len().value_counts() # all the codes have 16 digits and are strings
RGGVY['StateVillCode01'].value_counts() # there are duplicates
RGGVY['StateVillCode01'].describe() # there are duplicates # 360651 total count, 360473 unique (diff = 178)
RGGVY = RGGVY[~RGGVY.duplicated(['StateVillCode01'])]
RGGVY['StateVillCode01'].describe() # 360473 / 360473
# keep one observation out of the duplicates

# Cleaning RGGVY variables
# elect_status: E, U or D  Coverage Category(Un-Electrified/De-Electrified/Electrified)
# works_stat: Y, N, y, 0, T    Whether all the works have been completed
RGGVY['elect_status'] = RGGVY['elect_status'].replace('E', 1)
RGGVY['elect_status'] = RGGVY['elect_status'].replace('U', 0)
RGGVY['elect_status'] = RGGVY['elect_status'].replace('D', -1)

RGGVY['works_stat'] = RGGVY['works_stat'].replace('y', 'Y')
RGGVY['works_stat'] = RGGVY['works_stat'].replace('0', 'N')
RGGVY['works_stat'] = RGGVY['works_stat'].replace('T', np.NaN)
RGGVY['works_stat'] = RGGVY['works_stat'].replace('Y', 1)
RGGVY['works_stat'] = RGGVY['works_stat'].replace('N', 0)

##          USING THE CONCORDANCE FILE TO ASSOCIATE RGGVY OBSERVATION TO 2011 CODES
RGGVY['StateVillCode01'].describe() # 360473 / 360473
concordance['StateVillCode01'].describe() # 635929 / 645571
print RGGVY['StateVillCode01'].str.len().value_counts(), '\n', concordance['StateVillCode01'].str.len().value_counts()
print RGGVY['StateVillCode01'].dtype, '\n', concordance['StateVillCode01'].dtype

RGGVY = pd.merge(RGGVY, concordance, how='outer', left_on='StateVillCode01', right_on='StateVillCode01', indicator=True)
RGGVY['_merge'].value_counts()
# both          363086
# right_only    282485 only in concordance
# left_only       3173 only in RGGVY
RGGVY = RGGVY[RGGVY['_merge']== 'both']
coltokeep = ['elect_status', 'works_stat', 'StateVillCode01', 'Vill11']
RGGVY = RGGVY[coltokeep]
RGGVY = RGGVY.rename(columns = {'elect_status': 'RGGVY_elect_status', 'works_stat': 'RGGVY_works_stat'})
RGGVY['Vill11'].describe() # 363086 /363086

##          MERGING rggvy with main data
# merge rggvy and censusNL data on 2011 village codes
mydata['Vill11'].describe() # 649486 / 649486
RGGVY['Vill11'].describe() # 363086 / 363086
print RGGVY['Vill11'].str.len().value_counts(), '\n', mydata['Vill11'].str.len().value_counts()
print RGGVY['Vill11'].dtype, '\n', mydata['Vill11'].dtype

mydata = pd.merge(mydata, RGGVY, how='outer', left_on='Vill11', right_on='Vill11', indicator=True)
mydata['_merge'].value_counts()
# both          363086
# left_only     286412
# right_only         0

# create a dummy variable: RGGVY_dummy
# if not in RGGVY dataset, did not receive RGGVY
mydata['RGGVY_dummy'] = np.NaN
mydata['RGGVY_dummy'][mydata['_merge'] != 'left_only'] = 1
mydata['RGGVY_dummy'][mydata['_merge'] == 'left_only'] = 0

mydata = mydata.rename(columns={'_merge': 'merge_RGGVY'})
mydata['merge_RGGVY'].cat.categories
mydata['merge_RGGVY'] = mydata['merge_RGGVY'].cat.rename_categories(['censusNL_only', 'RGGVY_only', 'in_both'])


##############################################################################################################3
##          Cleaning up final data
##############################################################################################################3
# creating codes for district, subdistrict for fixed effects
mydata['St11'].str.len().value_counts()
mydata['Dis11'].str.len().value_counts()
mydata['Subd11'].str.len().value_counts()
mydata['Dis11_fe'] = mydata['St11'] + mydata['Dis11']
mydata['Subd11_fe'] = mydata['St11'] + mydata['Dis11'] + mydata['Subd11']
mydata = mydata.rename(columns={'StateVillCode01_x': 'StateVillCode01', 'FullCode01_conc': 'FullCode01'})


mydata.columns.values.tolist()

nl_list = mydata.columns.values.tolist()[mydata.columns.values.tolist().index('SH_count_11'):mydata.columns.values.tolist().index('NL_source')]
nl_list.remove('FullCode01_nl')

columns_ordered = ['Source', 'NL_source', 'St01', 'Dis01', 'Subd01', 'Vill01', 'Name01', 'St11', 'Dis11',
'Subd11', 'Vill11', 'Name11', 'StateVillCode01', 'StateVillCode11', 'FullCode01', 'FullCode11', 'Dis11_fe', 'Subd11_fe' ] + nl_list + ['RGGVY_elect_status', 'RGGVY_works_stat', 'RGGVY_dummy',

'latitude', 'longitude', 'c01_2001_area', 'c11_2011_total_area', 'c01_2001_dist_town',

'c01_2001_t_hh', 'c01_2001_t_p', 'c01_2001_sc_p', 'c01_2001_st_p',
'c11_2011_total_hhs', 'c11_2011_total_pop', 'c11_2011_total_sc_pop', 'c11_2011_total_st_pop',
"c01_2001_p_lit" , 'c11_2011_p_lit',
'c01_2001_power_supl', 'c11_2011_power_all',
"c11_2011_asset_availing_bank", "c11_2011_asset_radio_transitor", "c11_2011_asset_tv",
"c11_2011_asset_landline_phone", "c11_2011_asset_mobile_phone", "c11_2011_asset_both_phone",
"c11_2011_asset_bike", "c11_2011_asset_scooter_motor_moped", "c11_2011_asset_car_jeep_van", "c11_2011_asset_none",
'c01_2001_bank_fac', 'c01_2001_medi_fac',
'c11_2011_elec_light', 'c11_2011_kero_light', 'c11_2011_solar_light', 'c11_2011_other_light',
'c11_2011_another_light', 'c11_2011_nolight_light', 'c11_2011_cooking_electricity',

'c01_2001_st_code', 'c01_2001_dist_code', 'c01_2001_subdist_code', 'c01_2001_block_code', 'c01_2001_vill_code', 'c11_2011_vill_code']

mydata = mydata[columns_ordered]
mydata = mydata.replace({'': np.NaN}).replace({'NA': np.NaN})
mydata = mydata.sort_values(by='Vill11', ascending = True)


mydata.columns
# converting the identifier variables into floats
list_identifiers = ['St11', 'Dis11', 'Subd11', 'Vill11', 'Dis11_fe', 'Subd11_fe']

for colname in mydata.columns:
    print colname, mydata[colname].dtype
    if colname in list_identifiers:
        mydata[colname] = pd.to_numeric(mydata[colname], errors='coerce') #coerce
    if mydata[colname].dtype == 'O':
        mydata[colname] = mydata[colname].values.astype('unicode')
    print colname, mydata[colname].dtype


# reimporting data
mydatafile = '/home/emd/Dropbox/Night Lights Validation (Eugenie, Ryan, Johannes)/Data/Python_code_merging/Data_Merged_7_05.dta'
mydata = pd.read_stata(mydatafile)
print('a')

mydata['electrified_hh_nbr'].describe()


mydata['electrified_hh_per'] = mydata.c11_2011_elec_light
mydata['electrified_hh_nbr'] = mydata.electrified_hh_per * mydata.c11_2011_total_hhs / 100
mydata['electrified_hh_nbr'].describe()

list(mydata.columns.values).index('RGGVY_elect_status')
columns_list = list(mydata.columns.values)[:-2]
columns_list.insert(columns_list.index('RGGVY_elect_status'), list(mydata.columns.values)[-2])
columns_list.insert(columns_list.index('RGGVY_elect_status'), list(mydata.columns.values)[-1])
columns_list.remove('c11_2011_elec_light')
mydata = mydata[columns_list]


for varname in mydata.columns:
    print varname
    print mydata[varname].describe()
    print '\n \n'

# there are some negative values in c01_2001_dist_town
mydata['c01_2001_dist_town'] = mydata.apply(lambda x :
                                                    x['c01_2001_dist_town'] if x['c01_2001_dist_town'] >= 0
                                                    else -x['c01_2001_dist_town'], axis=1)
mydata['c01_2001_dist_town'].describe()

###################################################################3
# Adding distance to city
# reimporting data
mydatafile = '/home/emd/Dropbox/Night Lights Validation (Eugenie, Ryan, Johannes)/Data/Python_code_merging/Data_Merged_8_13.dta'
mydata = pd.read_stata(mydatafile)
print('a')
mydata['FullCode11'].str.len().describe()

mydata[mydata['FullCode11'].str.len() != 16]
mydata[mydata['FullCode11'].str.len() == 16]



path_city = '/home/emd/Dropbox/Night Lights Validation (Eugenie, Ryan, Johannes)/Data/Python_code_merging/ClosestCity.csv'
city = pd.read_csv(path_city, dtype=str)
city['Distance'] = city['Distance'].astype(np.float64)
del city['Unnamed: 0']
city = city[city['CENSUS2011'].str.len() == 16]
# only one observation with a 15 digit code so all of them shoudl have 16

mydata2 = pd.merge(mydata, city, how='outer', left_on='FullCode11', right_on='CENSUS2011',
                         suffixes=('_d', '_c'), indicator=True)

mydata2['_merge'].value_counts()
# both          626568
# left_only      22962
# right_only      4333
del city['CENSUS2011']
del mydata2['_merge']
mydata2['closest_city'] = mydata2['closest_city'].values.astype('unicode')


##############################################################################################################3
##          Outputting to Stata and R
##############################################################################################################3
# export to stata
os.chdir('/home/emd/Dropbox/Night Lights Validation (Eugenie, Ryan, Johannes)/Data/Python_code_merging/')
mydata2.to_stata('Data_Merged_11_20.dta', write_index=False)
print('saved')

Obs_inCensus_notNL = mydata[mydata['Source'] == 'conc_census']
filepath = '/home/emd/Dropbox/Night Lights Validation (Eugenie, Ryan, Johannes)/Data/Python_code_merging/Obs_inCensus_notNL.csv'
Obs_inCensus_notNL.to_csv(filepath, sep=',', header=True, index=False, na_rep='')


# reimporting data
mydatafile = '/home/emd/Dropbox/Night Lights Validation (Eugenie, Ryan, Johannes)/Data/Python_code_merging/Data_Merged_11_20_stata.dta'
mydata = pd.read_stata(mydatafile)






